import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Enumerate every file available under the read-only Kaggle input directory.
import os

for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the raw tornado dataset without any additional processing.
# The original hard-coded a local Windows path, yet the cell above lists
# files under /kaggle/input — the notebook clearly runs in more than one
# environment.  Resolve the first path that actually exists so the load
# works in either place (falls back to the original local path).
_candidate_paths = [
    r'C:\MachineLearning\Dataset\TornadosDataset\tornados.csv.zip',  # local Windows copy
    '/kaggle/input/tornados/tornados.csv',  # Kaggle input — TODO confirm dataset slug
]
ruta = next((p for p in _candidate_paths if os.path.exists(p)), _candidate_paths[0])
df = pd.read_csv(ruta)
# Quick visual check of the first 10 rows.
df.head(10)
om yr mo dy date time tz datetime_utc st stf ... elon len wid ns sn f1 f2 f3 f4 fc
0 192 1950 10 1 1950-10-01 21:00:00 America/Chicago 1950-10-02T03:00:00Z OK 40 ... -102.30 15.8 10 1 1 25 0 0 0 False
1 193 1950 10 9 1950-10-09 02:15:00 America/Chicago 1950-10-09T08:15:00Z NC 37 ... 0.00 2.0 880 1 1 47 0 0 0 False
2 195 1950 11 20 1950-11-20 02:20:00 America/Chicago 1950-11-20T08:20:00Z KY 21 ... 0.00 0.1 10 1 1 177 0 0 0 False
3 196 1950 11 20 1950-11-20 04:00:00 America/Chicago 1950-11-20T10:00:00Z KY 21 ... 0.00 0.1 10 1 1 209 0 0 0 False
4 197 1950 11 20 1950-11-20 07:30:00 America/Chicago 1950-11-20T13:30:00Z MS 28 ... 0.00 2.0 37 1 1 101 0 0 0 False
5 194 1950 11 4 1950-11-04 17:00:00 America/Chicago 1950-11-04T23:00:00Z PA 42 ... -75.93 15.9 100 1 1 71 11 0 0 False
6 198 1950 12 2 1950-12-02 15:00:00 America/Chicago 1950-12-02T21:00:00Z IL 17 ... -89.72 18.8 50 1 1 119 117 0 0 False
7 199 1950 12 2 1950-12-02 16:00:00 America/Chicago 1950-12-02T22:00:00Z IL 17 ... -89.38 18.0 200 1 1 119 5 0 0 False
8 200 1950 12 2 1950-12-02 16:25:00 America/Chicago 1950-12-02T22:25:00Z AR 5 ... -91.72 7.8 10 1 1 65 0 0 0 False
9 201 1950 12 2 1950-12-02 17:30:00 America/Chicago 1950-12-02T23:30:00Z IL 17 ... -89.62 9.6 50 1 1 157 0 0 0 False

10 rows × 27 columns

# (rows, columns) of the loaded dataset
df.shape
(68693, 27)
# Column dtypes, non-null counts and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68693 entries, 0 to 68692
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   om            68693 non-null  int64  
 1   yr            68693 non-null  int64  
 2   mo            68693 non-null  int64  
 3   dy            68693 non-null  int64  
 4   date          68693 non-null  object 
 5   time          68693 non-null  object 
 6   tz            68693 non-null  object 
 7   datetime_utc  68693 non-null  object 
 8   st            68693 non-null  object 
 9   stf           68693 non-null  int64  
 10  mag           67937 non-null  float64
 11  inj           68693 non-null  int64  
 12  fat           68693 non-null  int64  
 13  loss          41523 non-null  float64
 14  slat          68693 non-null  float64
 15  slon          68693 non-null  float64
 16  elat          68693 non-null  float64
 17  elon          68693 non-null  float64
 18  len           68693 non-null  float64
 19  wid           68693 non-null  int64  
 20  ns            68693 non-null  int64  
 21  sn            68693 non-null  int64  
 22  f1            68693 non-null  int64  
 23  f2            68693 non-null  int64  
 24  f3            68693 non-null  int64  
 25  f4            68693 non-null  int64  
 26  fc            68693 non-null  bool   
dtypes: bool(1), float64(7), int64(14), object(5)
memory usage: 13.7+ MB
# Summary statistics for the numeric columns, transposed for readability
df.describe().T
count mean std min 25% 50% 75% max
om 68693.0 1.132018e+05 2.266220e+05 1.0000 285.00 588.0000 1118.00 6.220800e+05
yr 68693.0 1.991854e+03 1.956516e+01 1950.0000 1976.00 1995.0000 2008.00 2.022000e+03
mo 68693.0 5.968541e+00 2.444656e+00 1.0000 4.00 6.0000 7.00 1.200000e+01
dy 68693.0 1.593088e+01 8.750070e+00 1.0000 8.00 16.0000 24.00 3.100000e+01
stf 68693.0 2.922026e+01 1.501327e+01 1.0000 18.00 28.0000 42.00 7.800000e+01
mag 67937.0 7.787215e-01 8.957898e-01 0.0000 0.00 1.0000 1.00 5.000000e+00
inj 68693.0 1.418689e+00 1.811475e+01 0.0000 0.00 0.0000 0.00 1.740000e+03
fat 68693.0 8.931041e-02 1.472120e+00 0.0000 0.00 0.0000 0.00 1.580000e+02
loss 41523.0 2.020898e+06 3.039588e+07 50.0000 10000.00 50000.0000 500000.00 2.800100e+09
slat 68693.0 3.712939e+01 5.099005e+00 17.7212 33.18 37.0000 40.92 6.102000e+01
slon 68693.0 -9.276149e+01 8.672112e+00 -163.5300 -98.42 -93.5552 -86.73 -6.471510e+01
elat 68693.0 2.296065e+01 1.852814e+01 0.0000 0.00 32.5500 38.65 6.102000e+01
elon 68693.0 -5.683609e+01 4.534073e+01 -163.5300 -94.78 -84.7200 0.00 0.000000e+00
len 68693.0 3.489270e+00 8.247115e+00 0.0000 0.12 0.8000 3.21 2.347000e+02
wid 68693.0 1.077676e+02 2.068513e+02 0.0000 20.00 50.0000 100.00 4.576000e+03
ns 68693.0 1.008764e+00 9.505967e-02 1.0000 1.00 1.0000 1.00 3.000000e+00
sn 68693.0 9.914111e-01 9.227835e-02 0.0000 1.00 1.0000 1.00 1.000000e+00
f1 68693.0 1.046440e+02 9.675030e+01 0.0000 37.00 85.0000 137.00 8.100000e+02
f2 68693.0 8.605142e+00 3.810602e+01 0.0000 0.00 0.0000 0.00 8.200000e+02
f3 68693.0 1.687202e+00 1.668166e+01 0.0000 0.00 0.0000 0.00 7.100000e+02
f4 68693.0 5.067911e-01 9.163194e+00 0.0000 0.00 0.0000 0.00 5.070000e+02
# Count of missing values per column (mag and loss are the only gaps)
df.isnull().sum()
om                  0
yr                  0
mo                  0
dy                  0
date                0
time                0
tz                  0
datetime_utc        0
st                  0
stf                 0
mag               756
inj                 0
fat                 0
loss            27170
slat                0
slon                0
elat                0
elon                0
len                 0
wid                 0
ns                  0
sn                  0
f1                  0
f2                  0
f3                  0
f4                  0
fc                  0
dtype: int64
# Remove every row that has at least one missing value.
# NOTE(review): 'loss' is missing in ~27k of ~68k rows, so this discards a
# large share of the data — confirm that is intended.
df = df.dropna()

# Parse the date column and express each date as days elapsed since the
# earliest date in the dataset.
df['date'] = pd.to_datetime(df['date'])
reference_date = df['date'].min()
df['date_numeric'] = (df['date'] - reference_date).dt.days

# Parse the time-of-day column (this attaches a dummy 1900-01-01 date) and
# derive the number of seconds since midnight.
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S')
clock = df['time'].dt
df['time_numeric'] = (clock.hour * 60 + clock.minute) * 60 + clock.second
# Numeric columns to include in the correlation analysis.
numeric_columns = [
    'om', 'yr', 'mo', 'dy', 'stf', 'mag', 'inj', 'fat', 'loss',
    'slat', 'slon', 'elat', 'elon', 'len', 'wid', 'ns', 'sn',
    'f1', 'f2', 'f3', 'f4',
]

# Restrict to the numeric subset and compute pairwise correlations.
numeric_df = df[numeric_columns]
correlation_matrix = numeric_df.corr()

print(correlation_matrix)
            om        yr        mo        dy       stf       mag       inj  \
om    1.000000  0.641333 -0.014129  0.038291  0.017673 -0.123721 -0.018815   
yr    0.641333  1.000000  0.011081  0.020438 -0.011882 -0.270327 -0.035231   
mo   -0.014129  0.011081  1.000000 -0.022037  0.017126 -0.053477 -0.023666   
dy    0.038291  0.020438 -0.022037  1.000000  0.007341 -0.010681 -0.009103   
stf   0.017673 -0.011882  0.017126  0.007341  1.000000  0.005377 -0.011306   
mag  -0.123721 -0.270327 -0.053477 -0.010681  0.005377  1.000000  0.230651   
inj  -0.018815 -0.035231 -0.023666 -0.009103 -0.011306  0.230651  1.000000   
fat  -0.007449 -0.027722 -0.025170 -0.003246 -0.009078  0.207488  0.757170   
loss  0.011247  0.019826 -0.009462 -0.001654 -0.005006  0.147667  0.528723   
slat -0.036809 -0.070104  0.057358 -0.002550  0.153543  0.065990 -0.003999   
slon  0.076815  0.108611  0.038201 -0.010476 -0.118155 -0.030855  0.017725   
elat  0.352794  0.512345 -0.021748  0.008169  0.012422  0.135053  0.058227   
elon -0.356862 -0.523312  0.036354 -0.011956  0.001557 -0.127254 -0.055493   
len   0.006556 -0.037387 -0.054607 -0.002215 -0.027625  0.415129  0.252756   
wid   0.166654  0.151644 -0.068346  0.014518  0.001584  0.384436  0.193788   
ns    0.041793  0.039080 -0.014718  0.004426 -0.007908  0.124138  0.115227   
sn   -0.041965 -0.039049  0.015142 -0.005649  0.007918 -0.124903 -0.117668   
f1    0.010428  0.001086 -0.036121  0.002581  0.233779 -0.004434 -0.014978   
f2    0.045533  0.031565 -0.036306 -0.003671  0.017878  0.203320  0.074652   
f3    0.009687 -0.012851 -0.029028  0.005268 -0.010504  0.157598  0.096212   
f4   -0.002056 -0.019453 -0.024600  0.002862 -0.006223  0.109177  0.074728   

           fat      loss      slat  ...      elat      elon       len  \
om   -0.007449  0.011247 -0.036809  ...  0.352794 -0.356862  0.006556   
yr   -0.027722  0.019826 -0.070104  ...  0.512345 -0.523312 -0.037387   
mo   -0.025170 -0.009462  0.057358  ... -0.021748  0.036354 -0.054607   
dy   -0.003246 -0.001654 -0.002550  ...  0.008169 -0.011956 -0.002215   
stf  -0.009078 -0.005006  0.153543  ...  0.012422  0.001557 -0.027625   
mag   0.207488  0.147667  0.065990  ...  0.135053 -0.127254  0.415129   
inj   0.757170  0.528723 -0.003999  ...  0.058227 -0.055493  0.252756   
fat   1.000000  0.471312 -0.006840  ...  0.049496 -0.048960  0.235652   
loss  0.471312  1.000000 -0.000465  ...  0.043983 -0.043704  0.154692   
slat -0.006840 -0.000465  1.000000  ...  0.134238  0.003470  0.027393   
slon  0.009747  0.004238 -0.123253  ...  0.057484  0.027445 -0.014806   
elat  0.049496  0.043983  0.134238  ...  1.000000 -0.973359  0.344643   
elon -0.048960 -0.043704  0.003470  ... -0.973359  1.000000 -0.341728   
len   0.235652  0.154692  0.027393  ...  0.344643 -0.341728  1.000000   
wid   0.184129  0.183187 -0.005657  ...  0.265341 -0.273972  0.361725   
ns    0.117815  0.049873  0.013577  ...  0.095309 -0.089331  0.263836   
sn   -0.119968 -0.051055 -0.013804  ... -0.096143  0.090145 -0.265096   
f1   -0.010800 -0.004242 -0.190463  ... -0.028823 -0.009024 -0.031833   
f2    0.052892  0.057888 -0.008680  ...  0.200617 -0.205818  0.360852   
f3    0.080189  0.079589 -0.006792  ...  0.096825 -0.097735  0.413864   
f4    0.059851  0.068563 -0.002527  ...  0.055573 -0.055220  0.383734   

           wid        ns        sn        f1        f2        f3        f4  
om    0.166654  0.041793 -0.041965  0.010428  0.045533  0.009687 -0.002056  
yr    0.151644  0.039080 -0.039049  0.001086  0.031565 -0.012851 -0.019453  
mo   -0.068346 -0.014718  0.015142 -0.036121 -0.036306 -0.029028 -0.024600  
dy    0.014518  0.004426 -0.005649  0.002581 -0.003671  0.005268  0.002862  
stf   0.001584 -0.007908  0.007918  0.233779  0.017878 -0.010504 -0.006223  
mag   0.384436  0.124138 -0.124903 -0.004434  0.203320  0.157598  0.109177  
inj   0.193788  0.115227 -0.117668 -0.014978  0.074652  0.096212  0.074728  
fat   0.184129  0.117815 -0.119968 -0.010800  0.052892  0.080189  0.059851  
loss  0.183187  0.049873 -0.051055 -0.004242  0.057888  0.079589  0.068563  
slat -0.005657  0.013577 -0.013804 -0.190463 -0.008680 -0.006792 -0.002527  
slon -0.015333  0.014608 -0.014400 -0.111574 -0.002058  0.002593 -0.001872  
elat  0.265341  0.095309 -0.096143 -0.028823  0.200617  0.096825  0.055573  
elon -0.273972 -0.089331  0.090145 -0.009024 -0.205818 -0.097735 -0.055220  
len   0.361725  0.263836 -0.265096 -0.031833  0.360852  0.413864  0.383734  
wid   1.000000  0.145225 -0.146145  0.004304  0.182506  0.152638  0.115759  
ns    0.145225  1.000000 -0.991692 -0.120844 -0.022475 -0.009025 -0.006423  
sn   -0.146145 -0.991692  1.000000  0.121857  0.023013  0.009024  0.006458  
f1    0.004304 -0.120844  0.121857  1.000000  0.081928  0.038337  0.024276  
f2    0.182506 -0.022475  0.023013  0.081928  1.000000  0.325328  0.170388  
f3    0.152638 -0.009025  0.009024  0.038337  0.325328  1.000000  0.434265  
f4    0.115759 -0.006423  0.006458  0.024276  0.170388  0.434265  1.000000  

[21 rows x 21 columns]

Análisis Exploratorio de Datos#

# Heatmap of the correlation matrix.
# Fixes: the original wrote `plt.show` without parentheses, which only
# references the function (see the printed function repr in the output)
# and never renders the figure.  Also, seaborn's `fmt` parameter only takes
# effect together with `annot=True`, so annotations are enabled here to make
# the '.2f' format actually do something.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.show()

# Hierarchically clustered view of the same correlation matrix.
sns.clustermap(correlation_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
_images/ddc835129ba353e2d1f90c931b693b6e82727337b70a4ff01f5909e9bd9487eb.png _images/821315bf29368128888a83d32f9f150794e0b2002b0f4a26fc6db1e9760d920a.png

Análisis Univariado#

# Relationship between tornado magnitude and reported monetary loss.
ax = plt.gca()
ax.scatter(df['mag'], df['loss'])
ax.set_xlabel('Magnitude')
ax.set_ylabel('Loss')
ax.set_title('Scatter Plot: Magnitude vs. Loss')
plt.show()

# Distribution of magnitude broken down by state.
plt.figure(figsize=(10, 7))
ax = sns.boxplot(data=df, x='st', y='mag')
ax.set_xlabel('State')
ax.set_ylabel('Magnitude')
ax.set_title('Box Plot: Magnitude by State')
plt.xticks(rotation=90)
plt.show()
_images/f08eeb7e5078108e4c7ecfaaca884cc70b2631932a072e54e069e35bd3f5a118.png _images/ad0bfca05c0a77dffd1eba152152a4ce350127785bf989eb21fe6a747e0b1e27.png
# Distribution of tornado magnitudes.
ax = plt.gca()
ax.hist(df['mag'], bins=20)
ax.set_xlabel('Magnitude')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Magnitude')
plt.show()

# Number of recorded tornadoes per state.
plt.figure(figsize=(10, 7))
ax = sns.countplot(data=df, x='st')
ax.set_xlabel('State')
ax.set_ylabel('Count')
ax.set_title('Counts by State')
plt.xticks(rotation=90)
plt.show()
_images/2bb252ff7b8c3af7f2a7a878840df92b112e0b2720227cb2969c123414c012f4.png _images/7adc711667abe408b8928d8ced72a9fe7faebd17ade18aa4f2615a7bda8536f9.png
# Share of tornado records per state.
chart = px.pie(df, names='st', title='Distribution of States')
chart.show()

# Distribution of the magnitude column.
chart = px.box(df, x='mag', title='Distribution of Magnitude')
chart.show()

# Pairwise scatter matrix over a small subset of numeric columns.
subset_columns = ['mag', 'loss', 'len', 'wid', 'inj']
chart = px.scatter_matrix(df, dimensions=subset_columns, title='Pair Plot of Numeric Variables')
chart.show()
# NOTE(review): despite the original "count the injuries" comment, this
# aggregates the 'loss' column — total monetary loss per year, scaled to
# units of $100 million — then plots the 10 highest years.
injuries_sorted_by_year = (df.groupby('yr')['loss'].sum()/ 100000000).sort_values(ascending=False)
injuries_sorted_by_year[:10].plot(kind='barh', color=sns.color_palette('Spectral'), figsize=(10, 7),  xlabel='Total Loss (in 100 millions)$')
<Axes: xlabel='Total Loss (in 100 millions)$', ylabel='yr'>
_images/6b07c519cc0d915903f7dfeca6d1e49b6d0f1b935b4ef12149ba2a0b4114578c.png
# 3-D view of magnitude, loss and path length, coloured by fatalities.
scatter3d = px.scatter_3d(
    df, x='mag', y='loss', z='len', color='fat',
    title='3D Scatter Plot: Magnitude vs. Loss vs. Length',
)
scatter3d.show()

# Tornado starting points plotted on a world map.
geo_chart = px.scatter_geo(df, lat='slat', lon='slon', title='Geographic Scatter Plot')
geo_chart.update_geos(projection_type='natural earth')
geo_chart.show()

MODELOS#

Modelo de Regresión#

# Predictors (magnitude, start/end coordinates, path length and width)
# and the regression target (monetary loss).
feature_cols = ['mag', 'slat', 'slon', 'elat', 'elon', 'len', 'wid']
X = df[feature_cols]
y = df['loss']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit an ordinary least-squares regression on the training split.
# `fit` returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Make predictions on the test set
y_pred = model.predict(X_test)  # one predicted loss value per held-out row

Predicciones#

# Evaluate the linear model on the held-out split: mean squared error and
# the coefficient of determination (R²).
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {test_mse:.2f}")
print(f"R-squared (R2): {test_r2:.2f}")
Mean Squared Error: 535175920589345.81
R-squared (R2): 0.02
# Visual check of the linear model: predicted vs. actual loss.
ax = plt.gca()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Loss')
ax.set_ylabel('Predicted Loss')
ax.set_title('Actual vs. Predicted Loss')
plt.show()
_images/68357ea8f9be8b1788905b20dc65d0404f255c36247849c1cdcb50762c5d5131.png
from sklearn.impute import SimpleImputer

# Initialize the imputer
# NOTE(review): rows with missing values were already dropped earlier via
# df.dropna(), so on this data the imputation is effectively a no-op —
# confirm whether the earlier dropna or this imputer is the intended
# missing-value strategy.
imputer = SimpleImputer(strategy='mean')  # You can use other strategies like 'median' or 'constant'

# Fit on the training split only (avoids leakage from the test split),
# then apply the same statistics to both splits.
# NOTE(review): transform returns NumPy arrays, so X_train/X_test stop
# being DataFrames (column names are lost) from this point on.
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# HistGradientBoostingRegressor has been a stable public estimator since
# scikit-learn 0.24, so the old
# `from sklearn.experimental import enable_hist_gradient_boosting`
# shim is no longer needed (it only emits a deprecation warning) and has
# been removed here.
from sklearn.ensemble import HistGradientBoostingRegressor

# Create a histogram-based gradient-boosting regressor with default
# hyperparameters; this rebinds `model` to the new estimator.
model = HistGradientBoostingRegressor()

# Train the model on the training data
model.fit(X_train, y_train)
HistGradientBoostingRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Predictions from the gradient-boosting model on the held-out split.
y_pred = model.predict(X_test)

# Visual check: predicted vs. actual loss for the boosted model.
ax = plt.gca()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Loss')
ax.set_ylabel('Predicted Loss')
ax.set_title('Actual vs. Predicted Loss (HistGradientBoostingRegressor)')
plt.show()
_images/24c311d4306b7a4dd78c886f0b79ead20ad81cb5a21d3ac8df35cfbbc9891b5c.png